Modeling the Joint Distribution of Wind Speed and Direction using Gaussain Mixture Models

OEN Method: Harris, Cook The parent wind speed distribution: Why Weibull? http://www.sciencedirect.com/science/article/pii/S0167610514001056

Gaussian Mixture Models, http://scikit-learn.org/stable/modules/mixture.html

1. Set up

1.1 Environment

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

from import_file import *
from helpers.parallel_helper import *
load_libs()

plt.rcParams['axes.autolimit_mode'] = 'round_numbers'
plt.rcParams['axes.xmargin'] = 0.
plt.rcParams['axes.ymargin'] = 0.
mpl.rcParams['patch.force_edgecolor'] = True

1.2 Read Data

In [2]:
# file_path= './data/NCDC/uk/marham/dat.txt' 
# file_path= './data/NCDC/uk/tiree/dat.txt'  # try 4
# file_path= './data/NCDC/uk/boscombe_down/dat.txt' # 4?, numpy bug
# file_path= './data/NCDC/uk/middle_wallop/dat.txt' 
# file_path= './data/NCDC/uk/southhamption/dat.txt' # high 0, trend
# file_path= './data/NCDC/uk/bournemouth/dat.txt' # 4?
# file_path= "./data/NCDC/uk/weybourne/dat.txt"
# file_path= "./data/NCDC/uk/skye_lusa/dat.txt" # 
# file_path= "./data/NCDC/uk/wattisham/dat.txt"
# file_path= "./data/NCDC/uk/south_uist_range/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/uk/holbeach/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/uk/cambridge/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/us/baltimore/dat.txt" # time too short
# file_path= "./data/NCDC/uk/bealach_na_ba/dat.txt" # time too short
# file_path= "./data/NCDC/uk/benbecula/dat.txt" # truncate (untruncate in m/s), 4?

# file_path= "./data/NCDC/southeast_asia/singapore_changi/dat.txt" # trend, no 0, questionary data
# file_path= "./data/NCDC/southeast_asia/paya_lebar/dat.txt" # questionary data
# file_path= "./data/NCDC/singapore_seletar/dat.txt"
# file_path= "./data/NCDC/sultan_mahmud/dat.txt" # Stable
# file_path= "./data/NCDC/southeast_asia/sultan_ismail/dat.txt" # 
# file_path= "./data/NCDC/east_asia/cheongju_intl/dat.txt" # 2005-2009  may have problem, fit is good; numpy problem
# file_path= "./data/NCDC/east_asia/daegu_ab/dat.txt" # recent 5 year may have problem, but fit is generally good; numpy problem

# file_path= "./data/NCDC/europe/landsberg_lech/dat.txt" # very good, can try 4
# file_path= "./data/NCDC/europe/neuburg/dat.txt"
# file_path= "./data/NCDC/europe/valladolid/dat.txt"
# file_path= "./data/NCDC/europe/laupheim/dat.txt" # double peak, 4; very good, trend
# file_path= "./data/NCDC/europe/avord/dat.txt" # try 4, initial speed (should be good with m/s)
# file_path= './data/NCDC/europe/ciampino/dat.txt' # try 4, bandwidth?
# file_path= "./data/NCDC/europe/holzdorf/dat.txt" # 2008 year
# file_path= "./data/NCDC/europe/huspel_aws/dat.txt"  # integer, 4?
# file_path= "./data/NCDC/europe/barayas/dat.txt" # numpy problem
# file_path= "./data/NCDC/europe/vatry/dat.txt"  # double peak, initial speed (should be good with m/s), mixed report type
# file_path= './data/NCDC/europe/tenerife_sur/dat.txt'  # some directions are blocked

# file_path= "./data/NCDC/oceania/auckland_intl/dat.txt"  # Good data, Weird KDE shape, might be blocked?
# file_path= "./data/NCDC/oceania/brisbane_archerfield/dat.txt" # high 0, few data 
# file_path= "./data/NCDC/oceania/narrandera/dat.txt" # high 0, few data
# file_path= "./data/NCDC/oceania/canberra/dat.txt" # high 0, numpy problem

file_path= "./data/NCDC/cn/shanghai/hongqiao_intl/dat.txt" 
# file_path= "./data/NCDC/cn/shanghai/pudong/dat.txt"
# file_path= "./data/NCDC/cn/nanjing_lukou/dat.txt" 
# file_path= "./data/NCDC/cn/zhengzhou_xinzheng/dat.txt" 
# file_path= "./data/NCDC/cn/tianjin/binhai/dat.txt" # few 0, trend, stationary speed, unstationary direction
# file_path= "./data/NCDC/cn/tianjin/tianjing/dat.txt" # 16 sectors
# file_path= "./data/NCDC/cn/hefei_luogang/dat.txt" # few 0, trend
# file_path= "./data/NCDC/cn/shijiazhuang_zhengding/dat.txt" 
# file_path= "./data/NCDC/cn/henan_gushi/dat.txt" # 16 sectors, fit not very good
# file_path= "./data/NCDC/cn/nanning_wuxu/dat.txt" # numpy priblem, unstationary speed
# file_path= './data/NCDC/cn/macau/dat.txt'  
# file_path= "./data/NCDC/cn/hk_intl/dat.txt" # few 0

# file_path = 'TOP/hr_avg.csv' # High 0
# file_path = './data/asos/denver/hr_avg.csv'

# file_path = './data/asos/bismarck_ND/hr_avg.csv' # try 4?
# file_path = './data/asos/aberdeen_SD/hr_avg.csv' # only to 2012, good fit, try 2
# file_path = './data/asos/minneapolis/hr_avg.csv'

# file_path = './data/asos/lincoln_NE/hr_avg.csv' 
# file_path = './data/asos/des_moines_IA/hr_avg.csv'
# file_path = './data/asos/springfield_IL/hr_avg.csv' # good fit
In [3]:
if "cn_database" in file_path: 
    df = read_cn_database(file_path)
elif 'NCDC' in file_path:
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df.rename(columns={'Date':'date','Dir':'dir','Spd':'speed','Type':'type','I.1':'wind_type'}, inplace=True)
    df = df[['date','HrMn','type','dir','speed','wind_type' ]]
    df.dropna(subset=['dir','speed'], inplace=True)
    integer_data = True
else:
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    integer_data = False
    knot_unit = True
In [4]:
df['time']=pd.to_datetime(df["date"].astype(str).map(str) + df["HrMn"], format='%Y%m%d%H%M')
df.set_index(['time'], inplace=True)
df['HrMn']=df['HrMn'].astype(int)
df = df.query("(dir <= 999) & (speed < 100) & \
              (date >= 19700000) & (date < 20170000) ")
In [5]:
plot_speed_and_angle_distribution(df.speed, df.dir)
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
In [6]:
# Dir [10,360]=> [0,350]
df['dir'] = df['dir'].apply(lambda x: x%360 if x < 999 else x) 
df['month'] = df['date']%10000//100
# Convert Windrose coordianates to Polar Cooridinates 
df['dir_windrose'] = df['dir']
df['dir'] = df['dir'].apply(lambda x: (90 - x)%360 if x < 999 else x)
df.describe()
Out[6]:
date HrMn dir speed month dir_windrose
count 3.369710e+05 336971.000000 336971.000000 336971.000000 336971.000000 336971.000000
mean 2.000006e+07 1128.070137 268.228497 3.543890 6.536892 248.807883
std 1.115119e+05 691.735348 275.924866 2.002307 3.448796 279.529540
min 1.973010e+07 0.000000 0.000000 0.000000 1.000000 0.000000
25% 1.991120e+07 530.000000 90.000000 2.000000 4.000000 90.000000
50% 2.002062e+07 1100.000000 200.000000 3.000000 7.000000 150.000000
75% 2.010050e+07 1730.000000 320.000000 5.000000 10.000000 310.000000
max 2.015030e+07 2357.000000 999.000000 30.000000 12.000000 999.000000
In [7]:
df.plot(y='speed',legend=True,figsize=(20,5))
Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0xcf45860>

1.2.1 Unit Detection

In [8]:
speed_unit_text = ' (knot)'
dir_unit_text = ' (degree)'
if 'knot_unit' not in globals():
    df['decimal'] = df.speed % 1
    df.decimal.hist(alpha=0.5, label='m/s', figsize=(4, 3))
    knot_unit = True if len(df.query('decimal >= 0.2')) / len(df) > 0.3 else False

    if knot_unit:
        df['speed'] = df['speed'] * 1.943845
        df['decimal'] = df.speed % 1
        df.decimal.hist(alpha=0.5, label='knot')
        # need more elaboration, some is not near an integer
        df['speed'] = df['speed'].apply(lambda x: int(round(x)))
        speed_unit_text = ' (knot)'
    else:
        speed_unit_text = ' (m/s)'
    plt_configure(xlabel='Decimal', ylabel='Frequency', legend={'loc': 'best'}, title='Decimal Distribution')
    df.drop(['decimal'], 1,inplace=True)
print(knot_unit)
False

1.2.2 Sampling Type Selection

In [9]:
sample_type = df.query('date > 20000000')['type']
sample_type.value_counts().plot(
    kind = 'bar', title = 'Report Types Comprisement', figsize=(4,3))

report_type_most_used = sample_type.value_counts().argmax()
df = df.query("type==@report_type_most_used")

1.2.3 Sampling Time Selection

In [10]:
MID_YEAR = (min(df.date)//10000+max(df.date)//10000)//2

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5,label='Overall')
df.query('date > @MID_YEAR * 10000')['HrMn'].value_counts().sort_index().plot(
    kind='bar', alpha=0.5, label='> %s' %  MID_YEAR )

plt_configure(xlabel='Sampling Time', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 4), 
              title = 'Sampling Time Distribution, Overall and > %s ' %  MID_YEAR)
In [11]:
df['sample_time'] = df.HrMn % 100 
sample_time = df.query('date > 20000000')['sample_time']
sample_times = sample_time.value_counts()[sample_time.value_counts() > 2000]
sample_times = sample_times.index.tolist()
df = df.query("sample_time in @sample_times")
df.drop(['sample_time'], 1,inplace=True)
print(sample_times)

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5, figsize=(10, 4))
[0, 30]
Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0xd2d0b70>

1.3 Data Wrangling

1.3.1 Artefacts

1.3.1.1 wrong direction record

In [12]:
if integer_data:
    display(df.query("(dir % 10 >= 0.1) & (dir != 999)"))
    df = df.query('(dir % 10 <= 0.1) | (dir == 999)')
date HrMn type dir speed wind_type month dir_windrose
time
1994-01-28 00:00:00 19940128 0 FM-15 119 3.0 N 1 331
1994-07-18 10:00:00 19940718 1000 FM-15 337 5.0 N 7 113
1994-08-05 11:00:00 19940805 1100 FM-15 335 9.0 N 8 115
1994-08-10 05:00:00 19940810 500 FM-15 319 10.0 N 8 131
1994-09-03 21:00:00 19940903 2100 FM-15 331 5.0 N 9 119
1994-12-03 14:00:00 19941203 1400 FM-15 316 3.0 N 12 134
1995-04-03 13:00:00 19950403 1300 FM-15 337 3.0 N 4 113
1998-06-03 11:00:00 19980603 1100 FM-15 59 10.0 N 6 31
1998-09-09 12:00:00 19980909 1200 FM-15 359 20.0 N 9 91

1.3.1.2 sudden increase in speed

In [13]:
# sudden increse
df['incre'] = df.speed.diff(1)
df['incre'].fillna(0, inplace=True)
df['incre_reverse'] = df.speed.diff(-1)
df['incre_reverse'].fillna(0, inplace=True)

display(df.sort_values(by='speed',ascending=False).head(10))
df['incre'].plot(kind='hist', bins=arange(-15, 15), legend=True, figsize=(8, 3))
date HrMn type dir speed wind_type month dir_windrose incre incre_reverse
time
1993-12-21 22:00:00 19931221 2200 FM-15 110 22.0 N 12 340 19.0 20.0
1993-08-02 14:00:00 19930802 1400 FM-15 140 21.0 N 8 310 17.0 21.0
2013-08-01 07:30:00 20130801 730 FM-15 170 17.0 N 8 280 12.0 7.0
2012-08-08 05:30:00 20120808 530 FM-15 0 17.0 N 8 90 2.0 4.0
1985-04-30 08:00:00 19850430 800 FM-15 310 16.0 N 4 140 9.8 9.8
2005-09-11 17:00:00 20050911 1700 FM-15 0 16.0 N 9 90 4.0 4.0
1981-12-19 07:00:00 19811219 700 FM-15 140 15.9 N 12 310 13.9 3.1
1994-10-20 08:00:00 19941020 800 FM-15 140 15.0 N 10 310 6.0 8.0
2012-08-08 05:00:00 20120808 500 FM-15 10 15.0 N 8 80 1.0 -2.0
2012-08-08 03:30:00 20120808 330 FM-15 10 15.0 N 8 80 2.0 0.0
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0xcf8c550>
In [14]:
incre_threshold = 20 if knot_unit else 10
print('sudden increase number', len(df.query('(incre > @incre_threshold )&(incre_reverse > @incre_threshold )')))
df = df.query('(incre < @incre_threshold )|(incre_reverse < @incre_threshold )')

# Check the max speed
display(df.sort_values(by='speed',ascending=False).head(10))
df.drop(['incre', 'incre_reverse'], 1, inplace=True)
sudden increase number 4
date HrMn type dir speed wind_type month dir_windrose incre incre_reverse
time
2013-08-01 07:30:00 20130801 730 FM-15 170 17.0 N 8 280 12.0 7.0
2012-08-08 05:30:00 20120808 530 FM-15 0 17.0 N 8 90 2.0 4.0
1985-04-30 08:00:00 19850430 800 FM-15 310 16.0 N 4 140 9.8 9.8
2005-09-11 17:00:00 20050911 1700 FM-15 0 16.0 N 9 90 4.0 4.0
1981-12-19 07:00:00 19811219 700 FM-15 140 15.9 N 12 310 13.9 3.1
2012-08-08 05:00:00 20120808 500 FM-15 10 15.0 N 8 80 1.0 -2.0
2012-08-08 04:00:00 20120808 400 FM-15 0 15.0 N 8 90 0.0 1.0
2012-08-08 02:30:00 20120808 230 FM-15 10 15.0 N 8 80 1.0 2.0
1994-10-20 08:00:00 19941020 800 FM-15 140 15.0 N 10 310 6.0 8.0
2012-08-08 03:30:00 20120808 330 FM-15 10 15.0 N 8 80 2.0 0.0

1.3.2 0 Speed

In [15]:
with_too_many_zero, null_wind_frequency = is_with_too_many_zero(df.query("(date >= 20050000)"))
delete_zero = with_too_many_zero
if delete_zero:
    df = df.query('(speed > 0)')
print(delete_zero, null_wind_frequency)
False 0.0386598990298

1.3.3 Direction re-aligment and 999

For some dataset, the 16 sectors are not record properly,

e.g. the sectors are [0,20,30,50], need to redistribute the angle into 22.5

In [16]:
display(df['dir'].value_counts().sort_index())
effective_column = df.query('dir < 999')['dir'].value_counts()[df['dir'].value_counts() > 30].sort_index()
if integer_data:
    SECTOR_LENGTH = 360/len(effective_column) 
else: 
    SECTOR_LENGTH = 10
print(len(effective_column), SECTOR_LENGTH)
0       6646
10      4810
20      5215
30      5799
40      6022
50      7703
60      8228
70      7032
80      7825
90     11834
100    10494
110     8819
120     9918
130     8737
140     5394
150     4686
160     4245
170     3374
180     2864
190     2063
200     2145
210     2574
220     2430
230     2399
240     2616
250     2844
260     3795
270     5435
280     6328
290     8989
300    14076
310    12280
320    12851
330    12939
340    10251
350     8242
999    31349
Name: dir, dtype: int64
36 10.0
In [17]:
df=realign_direction(df, effective_column)
df=fill_direction_999(df, SECTOR_LENGTH)

1.4 Time Shift Comparison

In [18]:
DIR_REDISTRIBUTE = 'even'
if DIR_REDISTRIBUTE == 'even':
    DIR_BIN = arange(-5, 360, 10) 
elif DIR_REDISTRIBUTE == 'round_up':
    DIR_BIN = arange(0, 360+10, 10) 

# Comparison between mid_year, looking for: 
# 1. Odd Even Bias
# 2. Time Shift of Wind Speed Distribution
bins = arange(0, df.speed.max() + 1)
df.query('date < @MID_YEAR * 10000')['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='< %s' % MID_YEAR)

df.query('date > @MID_YEAR * 10000')['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='> %s' % MID_YEAR)

plt.suptitle('Speed Comparison between year < %s, > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Speed', ylabel='Frequency', legend=True, figsize=(8, 3))
In [19]:
df.query('date < @MID_YEAR * 10000')['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='< %s' % MID_YEAR)

df.query('date > @MID_YEAR * 10000')['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='> %s' % MID_YEAR)

plt.suptitle('Dir Comparison between year < %s, and > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Dir', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 3), tight='x')
In [20]:
# Inspect the time shift of speed and degree distribution, and odd-even bias
check_time_shift(df, speed_unit_text=speed_unit_text, dir_unit_text=dir_unit_text)
1979 - 1979
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
1980 - 1984
1985 - 1989
1990 - 1994
1995 - 1999
2000 - 2004
2005 - 2009
2010 - 2014
2015 - 2015
In [21]:
df.resample('A').mean().plot(y='speed')
plt.gca().set_ylim(bottom=0)
df.resample('M').mean().plot(y='speed', figsize=(20,4))
plt.gca().set_ylim(bottom=0)
Out[21]:
(0, 6.0)
In [22]:
display(df[df['dir'].isnull()])
df.dropna(subset=['dir'], inplace=True)
date HrMn type dir speed wind_type month dir_windrose
time
In [23]:
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    den, _ = np.histogram(df[column], bins=bins, density=True)
    y_top=max(den)*1.2
    for year in arange(1980, 2016):
        end_year = year
        sub_df = df[str(year):str(end_year)]
        if len(sub_df) > 5000:
            plt.figure()
            df[column].hist(bins=bins, alpha=0.3, normed=True)
            sub_df[column].hist(bins=bins, alpha=0.5, figsize=(3,1.5), normed=True)
            plt.gca().set_ylim(top=y_top)
            plt_configure(title=str(year))
    align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
In [24]:
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    density_all, _ = np.histogram(df[column], bins=bins, density=True)
    df[column].hist(bins=bins, figsize=(5,3))

    R_squares = []
    years = []
    for year in arange(1980, 2016):
        start_year, end_year = year-1, year+1
        sub_df = df[str(start_year):str(end_year)]
        if len(sub_df) > 5000:
            density, _ = np.histogram(sub_df[column], bins=bins, density=True)
            y_mean = np.mean(density_all)
            SS_tot = np.sum(np.power(density_all - y_mean, 2))
            SS_res = np.sum(np.power(density_all - density, 2))

            R_square = 1 - SS_res / SS_tot
            R_squares.append(R_square)
            years.append(year)

    plt.figure()
    plot(years, R_squares)
    ylim = max(min(plt.gca().get_ylim()[0],0.85),0)
    plt.gca().set_ylim(bottom=ylim, top=1)
    plt_configure(figsize=(5,3))
    align_figures()

1.5 Re-distribute Direction and Speed (Optional)

e.g. Dir 50 -> -45 ~ 55, to make KDE result better

In [25]:
if integer_data:
    df = randomize_angle(df, DIR_REDISTRIBUTE, SECTOR_LENGTH)
In [26]:
if integer_data:
    if delete_zero:
        redistribute_method = 'down'
    else:
        redistribute_method = 'up'

    df, speed_redistribution_info = randomize_speed(df, redistribute_method)
Redistribute upward, e.g. 0 -> [0,1]

1.6 Generate (x,y) from (speed,dir)

In [27]:
# Cook orientation
# df['dir']= (df['dir'] + 180)%360
In [28]:
# There might be a small dot in the centre, which is due to too many zero (more than 1 speed) in center
# Scatter plot in matplot has performance issue, the speed is very slow
df['x'] = df['speed'] * cos(df['dir'] * pi / 180.0)
df['y'] = df['speed'] * sin(df['dir'] * pi / 180.0)

2. Re-select Data and Overview

2.1 Data Overview

In [29]:
## Summery of the data selection
print('Knot unit?', knot_unit)
print('Report type used:', report_type_most_used)
print('Sampling time used:', sample_times)
if 'speed_redistribution_info' in globals():
    print('Speed redistribution info:', speed_redistribution_info )

df_all_years = df # for later across-year comparison
df = df_all_years.query('(date >= 20100000) & (date < 20150000)')
# df = df.query('(HrMn == 0) and (speed >= 0.5) and (date%10000 > 900) and (date%10000 < 1000)' )
df.describe()
Knot unit? False
Report type used: FM-15
Sampling time used: [0, 30]
Speed redistribution info: Redistribute upward, e.g. 0 -> [0,1]
Out[29]:
date HrMn dir speed month dir_windrose x y
count 8.718500e+04 87185.000000 87185.000000 87185.000000 87185.000000 87185.000000 87185.000000 87185.000000
mean 2.012065e+07 1164.638413 184.770611 4.507226 6.524723 211.968148 1.028676 0.401863
std 1.413991e+04 692.254061 114.331417 1.982570 3.449085 239.037972 3.014952 3.733133
min 2.010010e+07 0.000000 -4.998457 0.000653 1.000000 0.000000 -16.611133 -12.124493
25% 2.011040e+07 530.000000 84.624646 3.139312 4.000000 80.000000 -0.918124 -2.429540
50% 2.012070e+07 1130.000000 167.011763 4.369490 7.000000 140.000000 1.389170 0.085924
75% 2.013093e+07 1730.000000 302.841613 5.732700 10.000000 280.000000 3.307444 3.335600
max 2.014123e+07 2330.000000 354.999600 17.736579 12.000000 999.000000 17.735881 14.494497
In [30]:
df.plot(y='speed',legend=True,figsize=(20,5))
Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x19975da0>
In [31]:
# df['date'].apply(lambda x: str(x)[:-2]).value_counts().sort_index().plot(kind='bar', figsize=(20,4))
df.resample('M').count().plot(y='date', kind='bar',figsize=(20,4))
Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x1b592390>
In [32]:
# 90 degree is east
ax = WindroseAxes.from_ax()
viridis = plt.get_cmap('viridis')
ax.bar(df.dir_windrose, df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=viridis)
ax.set_legend()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
In [33]:
if len(df) > 1000000:
    bins=arange(0,362)
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min')
    
    df = df_all_years.sample(n=500000, replace=True)    
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min resmapled')
    plt_configure(legend=True, figsize=(20,4))
In [34]:
x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed)

# 1. Histogram comparison
fig = plt.figure()
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data', normed=True)             
plot(x, y_weibull, '-', color='black',label='Weibull')   
plt_configure(figsize=(4,3),xlabel='V',ylabel='PDF', legend=True)

# 2. CDF comparison
fig = plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label='ECDF')
plot(log(x), log(-log(1-y_cdf_weibull)),'-', label='Weibull')
plt_configure(xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'}, figsize=(4,3))
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: RuntimeWarning: divide by zero encountered in log
In [35]:
df.plot(kind='scatter', x='x', y='y', alpha=0.05, s=2)
plt.gca().set_aspect('equal')
plt_configure(figsize=(3.2,3.2),xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)

2.2. Overview by Direction

In [36]:
if len(effective_column) == 16:
    rebinned_angle = 22.5
else: 
    rebinned_angle = 10
In [37]:
original_incre, incre = SECTOR_LENGTH, rebinned_angle
start, end = -original_incre/2 + incre/2, 360

max_speed = df.speed.max()
max_count = max_count_for_angles(df, start, end, incre)
plot_range = [0, max_speed, 0, max_count*1.05]

for angle in arange(start, end, incre):
    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)   
    
    fig = plt.figure()
    sub_df['speed'].hist(bins=arange(0, max_speed), alpha=0.5, label='Data')
    title ='%s (%s - %s), %s' % (angle, start_angle, end_angle, len(sub_df)) 
    plt.axis(plot_range)
    plt_configure(figsize=(3,1.5), title=title)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)

2.3 Overview by Month

In [38]:
month_incre = 1
current_df = df.query('speed>=1')
for month in arange(1, 12+month_incre, month_incre): 
    end_month = month+month_incre
    sub_df = current_df.query('(month >= @month) and (month < @end_month)')
    if len(sub_df) > 0:
        if month_incre == 1:
            title = 'Month: %s' % (month)
        else:
            title = 'Month: %s - %s ' % (month, end_month-1)
        ax = WindroseAxes.from_ax()
        ax.bar(sub_df.dir_windrose, sub_df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=plt.get_cmap('viridis'))
        plt_configure(figsize=(3,3), title=title)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

3. Create input data and configuration

In [39]:
SPEED_SET = array(list(zip(df.x, df.y)))
NUMBER_OF_GAUSSIAN = 3
FIT_METHOD = 'square_error'
DEFAULT_BANDWDITH = 1.5 if knot_unit else 0.7
fig_list = []
In [40]:
fit_limit = ceil(df['speed'].quantile(.95))
fitting_axis_range = arange(-fit_limit, fit_limit+1, 1)
print(fitting_axis_range)

FITTING_RANGE = []
for i in fitting_axis_range:
    for j in fitting_axis_range:
        FITTING_RANGE.append([i,j])
[-8 -7 -6 -5 -4 -3 -2 -1  0  1  2  3  4  5  6  7  8]
In [41]:
plot_limit = ceil(df['speed'].quantile(.95))
PLOT_AXIS_RANGE = arange(-plot_limit, plot_limit+1, 1)

4. Kernel Density Estimation

In [42]:
sample = SPEED_SET
KDE_KERNEL = 'gaussian'
# KDE_KERNEL, bandwidth = 'tophat', 1
In [43]:
%%time
from sklearn.grid_search import GridSearchCV
# from sklearn.model_selection import GridSearchCV  ## too slow

# The bandwidth value sometimes would be too radical
if knot_unit:
    bandwidth_range = arange(0.7,2,0.2)
else:
    bandwidth_range = arange(0.4,1,0.1)

# Grid search is unable to deal with too many data (a long time is needed)
if len(sample) > 50000:    
    df_resample=df.sample(n=50000, replace=True)
    bandwidth_search_sample = array(list(zip(df_resample.x, df_resample.y)))
else:
    bandwidth_search_sample = sample

grid = GridSearchCV(neighbors.KernelDensity(kernel = KDE_KERNEL),
                    {'bandwidth': bandwidth_range}, n_jobs=-1, cv=4) 

grid.fit(bandwidth_search_sample)
bandwidth = grid.best_params_['bandwidth']
print(bandwidth)
D:\ProgramData\Anaconda3\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
D:\ProgramData\Anaconda3\lib\site-packages\sklearn\grid_search.py:43: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.
  DeprecationWarning)
0.4
Wall time: 2min 46s
In [44]:
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH

kde = neighbors.KernelDensity(bandwidth=bandwidth, kernel = KDE_KERNEL).fit(sample)

points = FITTING_RANGE
# very slow if the dataset is too large, e.g. 100,000
# kde returns log prob, need to convert it
kde_result = exp(kde.score_samples(points))
print('bandwidth:', bandwidth, len(kde_result))
print(kde_result[:5])
bandwidth: 0.4 289
[  3.99276592e-08   1.54027761e-06   3.23131394e-06   9.61342825e-06
   4.06210140e-05]
In [45]:
# Plot jPDF
X = Y = PLOT_AXIS_RANGE
# Can't work if pass as generate_Z_from_X_Y(X,Y, exp(kde.score_samples())), need to use lambda
# see http://stackoverflow.com/questions/21035437/passing-a-function-as-an-argument-in-python
kde_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(kde.score_samples(coords)))
colorbar_lim = 0, kde_Z.max()

plot_3d_prob_density(X,Y,kde_Z)

fig_kde,ax1 = plt.subplots(figsize=(4,3))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, ax=ax1)

with sns.axes_style({'axes.grid' : False}):
    from matplotlib import ticker
    fig_hist,ax2 = plt.subplots(figsize=(4,3))
    _,_,_,image = ax2.hist2d(df.x, df.y, bins=PLOT_AXIS_RANGE, cmap='viridis',)
    ax2.set_aspect('equal')
    cb = plt.colorbar(image)
    tick_locator = ticker.MaxNLocator(nbins=6)
    cb.locator = tick_locator
    cb.update_ticks()
    plt_configure(ax=ax2, xlabel='x'+speed_unit_text,ylabel='y'+speed_unit_text)
align_figures()
In [46]:
kde_cdf = cdf_from_pdf(kde_result)

5. GMM by Expectation-maximization

In [47]:
sample= SPEED_SET
clf = mixture.GaussianMixture(n_components=NUMBER_OF_GAUSSIAN, covariance_type='full')
clf.fit(sample)
print(clf.converged_)
True
In [48]:
gmm_em_result = read_gmm_em_result(clf)
pretty_print_gmm(gmm_em_result)
Out[48]:
weight mean_x mean_y sig_x sig_y corr
1 0.435 2.821 -2.559 1.868 2.222 0.052
2 0.289 1.542 3.898 2.359 2.383 -0.215
3 0.276 -2.330 1.408 2.231 3.023 -0.163
In [49]:
fig,ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm_em_result, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.434894486376 [[ 2.82126504 -2.55936623]] [ 1.85995169  2.22947008] 171.72875109
0.288828714416 [[ 1.54178636  3.89806463]] [ 2.10005596  2.61387929] -136.366093348
0.276276799208 [[-2.32950819  1.40816669]] [ 2.16911214  3.06756107] -166.032439147
In [50]:
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(clf.score_samples(coords)))

def residule_between_kde_and_gmm(points):
    kde_vals = exp(kde.score_samples(points))
    gmm_vals = exp(clf.score_samples(points))
    return kde_vals - gmm_vals 

residual_Z = generate_Z_from_X_Y(X,Y, residule_between_kde_and_gmm)

plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()

fig = plt.figure(figsize=(4,3))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig_em = plt.figure(figsize=(4,3))
plot_2d_prob_density(X,Y,pdf_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig = plt.figure(figsize=(4,3))
plot_2d_prob_density(X,Y,residual_Z,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()

Goodness-of-fit Statistics

In [51]:
points = FITTING_RANGE
gmm_pdf_result = exp(clf.score_samples(points))
gof_df(gmm_pdf_result, kde_result)
Out[51]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.932 0.026 0.044 0.000001 0.055 0.299

6. GMM by Optimization

In [52]:
sample = SPEED_SET
points = FITTING_RANGE
max_speed = df.speed.max()
print(FIT_METHOD)
square_error
In [53]:
# from GMM,EM 
# GMM format: weight, meanx, meany, sigx, sigy, rho
x0 = gmm_em_result

cons = [
        # sum of every 6th element, which is the fraction of each gaussian
        {'type': 'eq', 'fun': lambda x: sum(x[::6]) - 1},
        # # limit the width/height ratio of elliplse, optional
#         {'type': 'ineq', 'fun': lambda x: width_height_ratios_set(x) - 1/3},
#         {'type': 'ineq', 'fun': lambda x: 3 - width_height_ratios_set(x)},
]

bonds = [(0., 0.99),(-fit_limit, fit_limit),(-fit_limit, fit_limit),
         (0., fit_limit),(0., fit_limit),(-0.99, 0.99)]*(len(x0)//6)

result = sp.optimize.minimize(
    lambda x0: GMM_fit_score(x0, kde_result, points, FIT_METHOD),
    x0,
    bounds = bonds,
    constraints=cons,
    tol = 0.000000000001,
    options = {"maxiter": 500})
result
Out[53]:
     fun: -14.218483557959413
     jac: array([  9.27222610e-01,  -4.76837158e-07,   1.19209290e-07,
        -3.57627869e-07,  -2.38418579e-07,   1.19209290e-07,
         9.27221775e-01,   0.00000000e+00,  -1.19209290e-07,
        -1.19209290e-07,  -3.57627869e-07,   1.19209290e-07,
         9.27218556e-01,   0.00000000e+00,  -2.38418579e-07,
         0.00000000e+00,   2.38418579e-07,  -3.57627869e-07,
         0.00000000e+00])
 message: 'Optimization terminated successfully.'
    nfev: 992
     nit: 49
    njev: 49
  status: 0
 success: True
       x: array([ 0.29662769,  3.39971286, -1.93760355,  1.49173542,  2.47432193,
        0.22260085,  0.30303232,  0.15577865, -1.99280165,  3.10215589,
        2.27329548, -0.50301054,  0.40033999,  0.00440606,  3.78548574,
        2.91003882,  2.01538398, -0.04041075])

6.1 GMM Result

In [54]:
gmm = group_gmm_param_from_gmm_param_array(result.x, sort_group = True)
mixed_model_pdf = generate_gmm_pdf_from_grouped_gmm_param(gmm)
gmm_pdf_result = mixed_model_pdf(points)
pretty_print_gmm(gmm)
Out[54]:
weight mean_x mean_y sig_x sig_y corr
1 0.400 0.004 3.785 2.910 2.015 -0.040
2 0.303 0.156 -1.993 3.102 2.273 -0.503
3 0.297 3.400 -1.938 1.492 2.474 0.223
In [55]:
fig_gmm, ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.400339991497 [[ 0.00440606  3.78548574]] [ 2.01222818  2.91222188] -93.0698019464
0.303032316395 [[ 0.15577865 -1.99280165]] [ 1.79075736  3.40359098] -118.935289241
0.296627692108 [[ 3.39971286 -1.93760355]] [ 1.43496666  2.50767108] 168.568098158

6.2 Goodness-of-fit statistics

In [56]:
gof_df(gmm_pdf_result, kde_result)
Out[56]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.956 0.014 0.041 6.683301e-07 0.044 0.240
In [57]:
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, mixed_model_pdf)# passing a function as an argument

def residule_between_kde_and_gmm(points):
    kde_vals = exp(kde.score_samples(points))
    gmm_vals = mixed_model_pdf(points)
    return kde_vals - gmm_vals 

residual_Z = generate_Z_from_X_Y(X,Y, residule_between_kde_and_gmm)

plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()

fig = plt.figure(figsize=(4,3))
plot_2d_prob_density(X,Y,kde_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig_gmm = plt.figure(figsize=(4,3))
plot_2d_prob_density(X,Y,pdf_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig = plt.figure(figsize=(4,3))
plot_2d_prob_density(X,Y,residual_Z,  xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()
In [58]:
def f(V,theta):
    return (mixed_model_pdf([[V*cos(theta),V*sin(theta)]]))*V
In [59]:
x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed)

# 3. GMM distribution
y_ = [integrate.nquad(f, [[0, x_val],[0, 2*pi]]) for x_val in x]
y_cdf_gmm = array(list(zip(*y_))[0])

plot(log(x), log(-log(1-y_ecdf)),'o', label = 'Empirical')
plot(log(x), log(-log(1-y_cdf_weibull)),'--', label = 'Weibull')
plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black', label = 'GMM')
plt_configure(xlabel='ln(V)',ylabel='ln(-ln(1-P))',legend={'loc':'best'})
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:7: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:8: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:9: RuntimeWarning: divide by zero encountered in log
In [60]:
# Calculate Speed Distribution
# 1. GMM Model
x = arange(0, max_speed, 0.5)
y_ =[integrate.nquad(f, [[x_-0.01, x_+0.01],[0, 2*pi]]) for x_ in x]
y_gmm = array(list(zip(*y_))[0])*len(df.speed)/0.02

# 2. Weibull
y_weibul = sp.stats.weibull_min.pdf(x, *weibull_params)

df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data')
plot(x, y_gmm,'-', color='black', label='GMM')
plot(x, y_weibul*len(df.speed), '--', color='black', label='Weibull') 
print('Speed Distribution Comparison')
plt_configure(xlabel='Speed'+speed_unit_text,
              ylabel='Frequency',legend=True, figsize=(4.5, 2))
plt.gca().set_ylim(bottom = 0)
plt.locator_params(axis='y', nbins=5)
Speed Distribution Comparison
In [61]:
# Calculate Angle Distribution
x = linspace(0,2*pi, num=36+1)
y_ =[integrate.nquad(f, [[0, inf],[x_-pi/36, x_+pi/36]]) for x_ in x]
y = array(list(zip(*y_))[0])*len(df['dir']) 

df['dir'].hist(bins=DIR_BIN, alpha=0.5, label='Data')
plot(x/pi*180, y,'-', color='black', label='GMM')
title='Direction Distribution Comparison'
plt_configure(xlabel='Direction'+dir_unit_text, ylabel='Frequency', 
              legend={'loc': 'best'} ,tight='xtight',figsize = (4.5,2))
dir_fig = plt.gcf()
print(title)
Direction Distribution Comparison
In [62]:
# %%time
incre = max(SECTOR_LENGTH, 10)
density_collection=Parallel(n_jobs=-1)(delayed(direction_compare)(gmm, df, angle, incre) 
                                        for angle in arange(0, 360, incre))  
# This R square is computed as in paper 
# Comparison of bivariate distribution constructionapproaches for analysing wind speed anddirection data
# http://onlinelibrary.wiley.com/doi/10.1002/we.400/full
print(true_R_square(density_collection))
0.925183449428

6.3 Sectoral Comaprison

In [63]:
# Calculate Speed Distribution
def model_data_comparison(df, original_incre = 10, incre = 10):
    start, end = -original_incre/2 + incre/2, 360
    max_diff_array = []
    curve_collection = []
    max_speed = df.speed.max()
    
    # Find a max count for plotting histogram
    max_count = max_count_for_angles(df, start, end, incre)
    plot_range = [0, max_speed, 0, max_count*1.05]
    
    for angle in arange(start, end, incre):
        angle_radian, incre_radian = radians(angle), radians(incre)  
        start_angle, end_angle = angle-incre/2, angle+incre/2
        
        # Select data from observation
        sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
        data_size = len(sub_df.speed)
        # 1. Get Weibull and ECDF
        x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed)
        # 2. Get GMM PDF, CDF
        _, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)
        
        # 3. Make Plots
        fig = plt.figure(figsize=(13,1.3))
        # 3.1. Frequency Comparison
        ax1 = fig.add_subplot(1,3,1)        
        sub_df['speed'].hist(bins=arange(0, sub_max_speed), alpha=0.5, label='Data')                  
        plot(x, y_gmm*data_size,'-', color='black', label='GMM')
        plot(x, y_weibull*data_size, '--', color='black',label='Weibull')   
#         plt_configure(xlabel = "$V$", ylabel='Frequency', legend=True)
        plt_configure(xlabel = "V", ylabel='Frequency', legend=True)
        plt.axis(plot_range)
        
        # 3.2. CDF Comaprison
        ax2 = fig.add_subplot(1,3,2)
        plot(x, y_ecdf,'o', alpha=0.8, label='Data')
        plot(x, y_cdf_gmm,'-', color='black',label='GMM')
        plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
        plt.gca().set_xlim(right = max_speed)
#         plt_configure(xlabel = "$V$", ylabel='$P$', legend=True)
        plt_configure(xlabel = "V", ylabel='P', legend=True)
        
        # 3.3. Weibull Comparison
        ax3 = fig.add_subplot(1,3,3)
        plot(log(x), log(-log(1-y_ecdf)),'o', alpha=0.8, label='Data')
        plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black', label='GMM')
        plot(log(x), log(-log(1-y_cdf_weibull)),'--',color='black',label='Weibull')
        plt.gca().set_xlim(right = log(max_speed+1))
#         plt_configure(xlabel="ln($V$)", ylabel="ln(-ln(1-$P$)",legend={'loc':'best'})
        plt_configure(xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'})
        
        bins = arange(0, sub_df.speed.max()+1)
        density, _ = np.histogram(sub_df['speed'], bins=bins, normed=True)
        density_expected_ =[integrate.nquad(f, [[x_, x_+1],[angle_radian-incre_radian/2, angle_radian+incre_radian/2]]) 
                            for x_ in bins[:-1]]
        density_expected_gmm = array(list(zip(*density_expected_ ))[0])/direction_prob
        R_square_gmm = sector_r_square(density, density_expected_gmm)
        
        density_expected_weibull = sp.stats.weibull_min.cdf(bins[1:], *weibull_params) - sp.stats.weibull_min.cdf(bins[:-1], *weibull_params) 
        R_square_weibull = sector_r_square(density, density_expected_weibull)

        diff, diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
        max_diff_array.append([len(sub_df), angle, diff.max(), x[diff.argmax()], 
                               diff_weibull.max(), x[diff_weibull.argmax()], R_square_gmm, R_square_weibull])
        curves = {'angle': angle, 'data_size': data_size, 'weight': direction_prob, 
                  'x': x, 'gmm_pdf': y_gmm, 'gmm_cdf': y_cdf_gmm,
                  'weibull_pdf': y_weibull, 'weibull_cdf': y_cdf_weibull, 'ecdf': y_ecdf}
        curve_collection.append(curves)
        
        plt.show()
        print('%s (%s - %s) degree' % (angle, start_angle, end_angle))
        print('data size:', len(sub_df), 'weight', len(sub_df)/len(df))
        print('GMM', 'Weibull')
        print('R square', R_square_gmm,  R_square_weibull)
        print('max diff:', diff.max(), diff_weibull.max(), 'speed value:', x[diff.argmax()], 'y gmm', y_cdf_gmm[diff.argmax()])
        print(' ')
    return max_diff_array, curve_collection
In [64]:
%%time
if len(effective_column) == 16:
    rebinned_angle = 22.5
else: 
    rebinned_angle = 20
max_diff_array, curve_collection = model_data_comparison(df, SECTOR_LENGTH, rebinned_angle)
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:46: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:47: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:48: RuntimeWarning: divide by zero encountered in log
5.0 (-5.0 - 15.0) degree
data size: 2817 weight 0.03231060388828354
GMM Weibull
R square 0.980132759899 0.956769615074
max diff: 0.0429320498754 0.0363172838231 speed value: 1.86700832414 y gmm 0.0844655962013
 
25.0 (15.0 - 35.0) degree
data size: 4304 weight 0.04936629007283363
GMM Weibull
R square 0.900585997327 0.938488926753
max diff: 0.0726272914214 0.0542291949042 speed value: 5.09275556099 y gmm 0.660179400028
 
45.0 (35.0 - 55.0) degree
data size: 5222 weight 0.059895624247290245
GMM Weibull
R square 0.963840909336 0.974364378258
max diff: 0.0367177967195 0.123374804262 speed value: 3.15767348773 y gmm 0.226874824678
 
65.0 (55.0 - 75.0) degree
data size: 5109 weight 0.05859952973561966
GMM Weibull
R square 0.972152627736 0.980626366128
max diff: 0.0615148992534 0.106942891987 speed value: 6.15390472161 y gmm 0.773787359617
 
85.0 (75.0 - 95.0) degree
data size: 7989 weight 0.09163273498881688
GMM Weibull
R square 0.937659516237 0.967558057769
max diff: 0.0816673566212 0.069445026352 speed value: 6.90584719273 y gmm 0.880265428971
 
105.0 (95.0 - 115.0) degree
data size: 6317 weight 0.07245512416126627
GMM Weibull
R square 0.891305633486 0.958424938933
max diff: 0.0908353851757 0.081510630991 speed value: 4.34429672732 y gmm 0.419533460795
 
125.0 (115.0 - 135.0) degree
data size: 5915 weight 0.06784423926134082
GMM Weibull
R square 0.95941119472 0.957578515304
max diff: 0.061338773154 0.114977124022 speed value: 3.13316344692 y gmm 0.210789322605
 
145.0 (135.0 - 155.0) degree
data size: 2973 weight 0.03409990250616505
GMM Weibull
R square 0.933843026282 0.968675220138
max diff: 0.0548930210082 0.144307826538 speed value: 5.87264164126 y gmm 0.716514278997
 
165.0 (155.0 - 175.0) degree
data size: 2991 weight 0.034306360038997535
GMM Weibull
R square 0.965550826816 0.960912593977
max diff: 0.0229660656487 0.0526445506334 speed value: 4.4861225991 y gmm 0.580176695969
 
185.0 (175.0 - 195.0) degree
data size: 2175 weight 0.024946951883924987
GMM Weibull
R square 0.92260956142 0.929377952199
max diff: 0.0629853207948 0.0613828537515 speed value: 3.59778095492 y gmm 0.507589391849
 
205.0 (195.0 - 215.0) degree
data size: 2166 weight 0.024843723117508747
GMM Weibull
R square 0.965856173588 0.959980554442
max diff: 0.0263079315897 0.0977042108103 speed value: 0.584176928649 y gmm 0.0189367590844
 
225.0 (215.0 - 235.0) degree
data size: 2128 weight 0.024407868325973504
GMM Weibull
R square 0.95349794334 0.977461577247
max diff: 0.0549477197506 0.0424138599737 speed value: 1.30273981939 y gmm 0.10482671634
 
245.0 (235.0 - 255.0) degree
data size: 1711 weight 0.01962493548202099
GMM Weibull
R square 0.937646008559 0.990260357379
max diff: 0.05915851097 0.0404040042574 speed value: 1.34460600629 y gmm 0.107410746774
 
265.0 (255.0 - 275.0) degree
data size: 3089 weight 0.03543040660664105
GMM Weibull
R square 0.932720032897 0.988696268868
max diff: 0.0759754576976 0.0890760477601 speed value: 4.61095666992 y gmm 0.755159659705
 
285.0 (275.0 - 295.0) degree
data size: 5395 weight 0.0618799105350691
GMM Weibull
R square 0.973181615383 0.982384495443
max diff: 0.0424007256908 0.0927805346123 speed value: 3.73558787855 y gmm 0.363530692289
 
305.0 (295.0 - 315.0) degree
data size: 9983 weight 0.1145036416814819
GMM Weibull
R square 0.993505235981 0.99191061297
max diff: 0.0191772629631 0.100416848973 speed value: 6.69839774541 y gmm 0.837768868693
 
325.0 (315.0 - 335.0) degree
data size: 9280 weight 0.10644032803807994
GMM Weibull
R square 0.970816150444 0.975324583668
max diff: 0.0482424729281 0.0634701772978 speed value: 5.02283479151 y gmm 0.601973044313
 
345.0 (335.0 - 355.0) degree
data size: 6506 weight 0.07462292825600735
GMM Weibull
R square 0.976122820692 0.963063579419
max diff: 0.0491892354106 0.0377715078735 speed value: 2.9181256928 y gmm 0.208273157944
 
Wall time: 1min 4s
In [65]:
diff_df = pd.DataFrame(max_diff_array,columns=['datasize','direction', 'gmm', 'speed_gmm',
                                               'weibull', 'speed_weibull', 'r_square_gmm', 'r_square_weibull'])  

gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.r_square_gmm, diff_df.r_square_weibull, diff_df.direction, diff_df.datasize)
plt_configure(ylabel="$\ R^2$", xlabel='Direction'+dir_unit_text)
ylim = min(plt.gca().get_ylim()[0],0.85)
plt.gca().set_ylim(top=1, bottom=ylim)
print(gmm_mean, weibull_mean)
0.9554914681857765 0.9699423725890005
In [66]:
gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.gmm, diff_df.weibull, diff_df.direction, diff_df.datasize)
plt_configure(ylabel="K-S", xlabel='Direction'+dir_unit_text)
ylim = max(plt.gca().get_ylim()[1],0.15)
plt.gca().set_ylim(top=ylim, bottom=0)
print(gmm_mean, weibull_mean)
0.05338944330948914 0.08109741295675635
In [67]:
# Compare direction weight with previous figure
display(dir_fig)

6.4 Insufficient-fit Sector Investigation

6.4.1 Data Variability, by Bootstrap (Resampling)

In [68]:
max_diff_element = max(max_diff_array, key=lambda x: x[2])
angle =  max_diff_angle = max_diff_element[1]
incre = rebinned_angle
In [69]:
FRACTION = 1

# Select data from observation
start_angle, end_angle = angle-incre/2, angle+incre/2
angle_radian, incre_radian = radians(angle), radians(incre)  
sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
In [70]:
x = arange(0, sub_max_speed, 0.5)
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed, x)
_, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)

fig = plt.figure(figsize=(13,1.5))
ax1 = fig.add_subplot(1,3,1)   
ax2 = fig.add_subplot(1,3,2)   
ax3 = fig.add_subplot(1,3,3)   

# 1. Data
bins=arange(0, sub_max_speed)
sub_df['speed'].hist(ax=ax1, bins=bins, alpha=0.5, label='Data', normed=True)  

# 2. GMM
ax1.plot(x, y_gmm,'-', color='black', label='GMM')
ax2.plot(x, y_cdf_gmm,'-', color = 'black', label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color = 'black',label='GMM')

# 3. Weilbull 
ax1.plot(x, y_weibull,'--',color='black',label='Weibull')
ax2.plot(x, y_cdf_weibull,'--',label='Weibull')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)),'--',label='Weibull')

# 4. Data Resampled
count_collection = []
for i in range(1,100):
    sub_df_resampled = sub_df.sample(frac=FRACTION, replace=True)    
    resampled_count, _ = np.histogram(sub_df_resampled['speed'], bins=bins, normed=True) 
    count_collection.append(resampled_count)
    
    ecdf = sm.distributions.ECDF(sub_df_resampled.speed)
    y_ecdf = ecdf(x) 
    ax2.plot(x, y_ecdf,':', label='Data Resampled')
    ax3.plot(log(x), log(-log(1-y_ecdf)),':', label='Data Resampled')
    if i == 1: 
#         plt_configure(ax=ax2, xlabel = "$V$", ylabel='$P$', legend={'loc':'best'})
#         plt_configure(ax=ax3, xlabel="ln($V$)", ylabel="ln(-ln(1-$P$)",legend={'loc':'best'})
        plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
        plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'})

print('%s (%s - %s) Degree Speed Distribution' % (
    
    angle, start_angle, end_angle))
count_collection = np.array(count_collection)
mx, mn = np.max(count_collection,0), np.min(count_collection,0)
ax1.plot(bins[1:]-0.5, mx, ':', color='blue')
ax1.plot(bins[1:]-0.5, mn, ':', color='blue', label='Resample limit')
ax1.set_ylim(bottom = 0)
# plt_configure(ax=ax1, xlabel='$V$',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax1, xlabel='V', ylabel='Frequency',legend={'loc':'best'})
ax1.locator_params(axis='y', nbins=5)
ax2.locator_params(axis='y', nbins=5)
ax3.locator_params(axis='y', nbins=5)

diff = abs(y_ecdf - y_cdf_gmm)
print(diff.max(), x[diff.argmax()], y_cdf_gmm[diff.argmax()])
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:17: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:22: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:34: RuntimeWarning: divide by zero encountered in log
105.0 (95.0 - 115.0) Degree Speed Distribution
0.0912551492315 4.0 0.350568501236

6.4.2 Time Variability

In [71]:
fig_time_variability_3d = plt.figure()
ax1 = fig_time_variability_3d.gca(projection='3d')

fig_time_variability_cdf,ax2 = plt.subplots(figsize=(3,1.8))
fig_time_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

ax2.plot(x, y_cdf_gmm,'-', color='black', label = 'GMM')
ax2.plot(x, y_cdf_weibull,'--', label='Weibull')

ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black',label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)), '--', label='Weibull')

# 3. Data
prop_cycle=iter(mpl.rcParams['axes.color_cycle'])
for start_time in range(20000000, 20150000, 50000):
    end_time = start_time + 50000 
    time_label = start_time//10000
    df_other_years = df_all_years.query('(date >= @start_time) & (date < @end_time)')
    df_other_years_at_angle, sub_max_speed_other_year = select_df_by_angle(df_other_years, start_angle, end_angle)
    if len(df_other_years_at_angle) > 0 :
        
        ecdf = sm.distributions.ECDF(df_other_years_at_angle.speed)
        y_ecdf = ecdf(x)
        ax2.plot(x, y_ecdf,':', label = time_label)
        ax3.plot(log(x), log(-log(1-y_ecdf)),':', label = time_label)
        
        title = '%s - %s' %(time_label, time_label+4)
        count, division = np.histogram(df_other_years_at_angle['speed'], normed=True,
                                       bins=arange(0, sub_max_speed_other_year))
        ax1.bar(left=division[:-1], height=count, zs=time_label, zdir='x', 
                color=next(prop_cycle), alpha=0.8)
        x_3d = time_label*np.ones_like(x)
        ax1.plot(x_3d, x, y_gmm, '-', color='black', label='GMM'  if time_label == 2010 else '')
        ax1.plot(x_3d, x, y_weibull, '--', color='blue', label='Weibull' if time_label == 2010 else '')
        
print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
ax1.set_ylim(bottom = 0)
ax1.set_zlabel('Frequency')
plt_configure(ax=ax1, xlabel='Time',ylabel='V', legend=True)
# plt_configure(ax=ax2, xlabel = "$V$", ylabel='$P$', legend={'loc':'best'})
# plt_configure(ax=ax3, xlabel="ln($V$)", ylabel="ln(-ln(1-$P$)", legend={'loc':'best'})
plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)", legend={'loc':'best'})

ax1.set_zlim(bottom = 0)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:10: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:25: RuntimeWarning: divide by zero encountered in log
105.0 (95.0 - 115.0) Degree Speed Distribution

6.4.3 Adjacent Sector Variability

In [72]:
incre = rebinned_angle
angle_group = [max_diff_angle-incre, max_diff_angle, max_diff_angle+incre]
In [73]:
fig_adjecent_variability_3d = plt.figure()
ax1 = fig_adjecent_variability_3d.gca(projection='3d')
fig_adjecent_variability_cdf, ax2 = plt.subplots(figsize=(3,1.8))
fig_adjecent_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

legend_3d = False
prop_cycle=iter(mpl.rcParams['axes.color_cycle'])

curve_df = pd.DataFrame(curve_collection)

for angle in angle_group:
    curves = curve_df.query('angle == @angle%360').T.to_dict()
    curves = curves[list(curves)[0]]
    data_size, x =  curves['data_size'], curves['x']
    y_gmm, y_cdf_gmm =  curves['gmm_pdf'], curves['gmm_cdf'] 
    y_weibull, y_cdf_weibull, y_cdf = curves['weibull_pdf'],  curves['weibull_cdf'], curves['ecdf']

    linestyle = '-' if angle == max_diff_angle else ':'
    alpha = 0.7 if angle == max_diff_angle else 0.3

    ax2.plot(x, y_gmm*data_size, linestyle, label=angle)        
    ax3.plot(x, y_weibull*data_size, linestyle, label=angle)

    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)

    x_3d = angle*np.ones_like(x)
    ax1.plot(x_3d, x, y_gmm*data_size, color='black', label='GMM')
    ax1.plot(x_3d, x, y_weibull*data_size, color='blue', linestyle='--',label='Weibull')

    count, division = np.histogram(sub_df['speed'], bins=arange(0, sub_max_speed))
    ax1.bar(left=division[:-1], height=count, zs=angle, zdir='x', color=next(prop_cycle), alpha=0.8)

    if legend_3d == False:
        ax1.legend()
        legend_3d = True
        
plt_configure(ax=ax1, xlabel='Direction', ylabel='Speed')   
plt_configure(ax=ax2, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax3, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
ax1.set_zlabel('Frequency')
ax1.set_zlim(bottom = 0)
ylim = max(ax1.get_ylim()[1],ax3.get_ylim()[1])
ax2.set_ylim(bottom = 0, top=ylim)
ax3.set_ylim(bottom = 0, top=ylim)

print(max_diff_angle) 
print('GMM, Weibull, Histogram')
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
105.0
GMM, Weibull, Histogram

7. Result Variability & Cross-Validation

In [74]:
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH    
if 'FIT_METHOD' not in globals():
    FIT_METHOD = 'square_error'       
if 'KDE_KERNEL' not in globals():
    KDE_KERNEL = 'gaussian'
    
config = {'bandwidth': bandwidth, 
          'fitting_range': FITTING_RANGE,
          'fit_limit': fit_limit,
          'kde_kernel': KDE_KERNEL}
speed_unit_text=' (knot)'

7.1 Variability of the Result

In [75]:
%%time
results = Parallel(n_jobs=-1)(delayed(resampled_fitting)(df, FIT_METHOD, NUMBER_OF_GAUSSIAN, config) for i in range(10))                        
for result in results:
    display(pretty_print_gmm(result['gmm']))
    fig,ax = plt.subplots(figsize=(3.5,3.5))
    plot_gmm_ellipses(result['gmm'],ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
    plt.show()
    
    display(gof_df(result['gmm_pdf_result'], result['kde_result']))
    display(gof_df(result['gmm_pdf_result'], kde_result))
    print('')
weight mean_x mean_y sig_x sig_y corr
1 0.404 -0.039 3.749 2.889 2.023 -0.031
2 0.304 3.349 -1.999 1.499 2.616 0.241
3 0.291 0.211 -1.979 3.138 2.087 -0.472
GMM Plot Result
0.404137512449 [[-0.03938441  3.74942569]] [ 2.02093717  2.89039568] -92.4639752075
0.304482111268 [[ 3.34857403 -1.99871838]] [ 1.43537205  2.65130037] 168.834079506
0.291380376283 [[ 0.2109411  -1.97903989]] [ 1.72314854  3.35155634] -114.197894416
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.955 0.014 0.047 6.966436e-07 0.044 0.245
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.956 0.014 0.046 6.724880e-07 0.044 0.241

weight mean_x mean_y sig_x sig_y corr
1 0.405 -0.086 3.759 2.893 2.033 -0.003
2 0.306 3.353 -1.997 1.506 2.646 0.255
3 0.290 0.193 -1.925 3.123 2.095 -0.461
GMM Plot Result
0.404557951679 [[-0.08606258  3.75865831]] [ 2.03309848  2.89288251] -90.2614529176
0.305803982867 [[ 3.35318225 -1.99735927]] [ 1.43528659  2.68508046] 168.381195799
0.289638065454 [[ 0.19261917 -1.92465674]] [ 1.74158157  3.33310178] -114.193172029
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.954 0.012 0.049 6.947780e-07 0.045 0.245
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.956 0.013 0.045 6.712098e-07 0.044 0.241

weight mean_x mean_y sig_x sig_y corr
1 0.410 -0.043 3.742 2.919 2.036 -0.019
2 0.306 3.386 -2.075 1.486 2.583 0.227
3 0.284 0.082 -1.909 3.048 2.055 -0.464
GMM Plot Result
0.410474913474 [[-0.04323724  3.7422801 ]] [ 2.03514015  2.91906126] -91.4866402173
0.305926984778 [[ 3.38649814 -2.07520315]] [ 1.4293104   2.61425103] 169.329941868
0.283598101748 [[ 0.08228998 -1.90905158]] [ 1.7027567   3.25803494] -114.449433255
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.955 0.014 0.051 6.918989e-07 0.045 0.244
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.956 0.013 0.049 6.731353e-07 0.044 0.241

weight mean_x mean_y sig_x sig_y corr
1 0.403 0.139 3.782 2.934 2.006 -0.082
2 0.321 0.179 -2.129 3.159 2.523 -0.560
3 0.276 3.418 -1.861 1.455 2.221 0.205
GMM Plot Result
0.403293079444 [[ 0.13906781  3.78165863]] [ 1.99374509  2.94233416] -95.9799063734
0.321146918765 [[ 0.17906472 -2.12851656]] [ 1.83283911  3.60370103] -123.986192656
0.275560001791 [[ 3.41771174 -1.86125783]] [ 1.40373752  2.25365973] 167.400134879
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.956 0.013 0.041 6.701489e-07 0.043 0.240
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.956 0.013 0.040 6.719277e-07 0.044 0.241

weight mean_x mean_y sig_x sig_y corr
1 0.401 0.114 3.770 2.957 2.029 -0.092
2 0.325 0.215 -2.132 3.084 2.490 -0.561
3 0.274 3.450 -1.863 1.454 2.244 0.182
GMM Plot Result
0.401171215597 [[ 0.11396036  3.76975378]] [ 2.01330568  2.96782555] -96.6890669451
0.325090431228 [[ 0.21450744 -2.13217858]] [ 1.79978816  3.53162862] -124.480959831
0.273738353175 [[ 3.44983282 -1.86299468]] [ 1.41366093  2.26934237] 168.956993026
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.955 0.013 0.044 6.866935e-07 0.044 0.243
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.956 0.013 0.041 6.717531e-07 0.044 0.241

weight mean_x mean_y sig_x sig_y corr
1 0.401 -0.111 3.775 2.854 2.029 0.007
2 0.306 3.402 -2.002 1.481 2.668 0.239
3 0.293 0.253 -1.980 3.066 2.096 -0.445
GMM Plot Result
0.401063506686 [[-0.11061442  3.77470391]] [ 2.02901023  2.85424138] -89.4007085256
0.306050114912 [[ 3.40168696 -2.00154622]] [ 1.42008318  2.70052213] 169.493083575
0.292886378402 [[ 0.2528831  -1.97984327]] [ 1.76007088  3.27111026] -114.399451513
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.952 0.014 0.051 7.268737e-07 0.046 0.250
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.956 0.013 0.043 6.721101e-07 0.044 0.241

weight mean_x mean_y sig_x sig_y corr
1 0.399 0.055 3.796 2.877 2.011 -0.037
2 0.302 3.401 -1.964 1.499 2.506 0.211
3 0.299 0.006 -1.925 3.128 2.289 -0.517
GMM Plot Result
0.398827842682 [[ 0.05468742  3.79589771]] [ 2.00839998  2.8792171 ] -92.9169819226
0.301972377468 [[ 3.40115638 -1.96416671]] [ 1.44831612  2.53612884] 169.251660732
0.29919977985 [[ 0.0056976 -1.9248539]] [ 1.77982507  3.44284992] -119.221831895
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.955 0.014 0.043 6.819572e-07 0.045 0.243
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.956 0.014 0.041 6.738137e-07 0.044 0.241

weight mean_x mean_y sig_x sig_y corr
1 0.395 -0.085 3.782 2.883 2.009 -0.017
2 0.316 3.393 -2.040 1.522 2.653 0.238
3 0.288 0.075 -1.871 3.064 2.103 -0.461
GMM Plot Result
0.395300445986 [[-0.084541    3.78237409]] [ 2.00845046  2.88346395] -91.3134679156
0.3164031371 [[ 3.39346701 -2.0400266 ]] [ 1.45940093  2.68799106] 168.924709681
0.288296416914 [[ 0.07464513 -1.87114093]] [ 1.74197596  3.28205475] -115.044659327
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.953 0.013 0.048 7.097866e-07 0.045 0.247
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.956 0.014 0.045 6.714535e-07 0.044 0.241

weight mean_x mean_y sig_x sig_y corr
1 0.391 -0.125 3.817 2.843 1.998 -0.007
2 0.311 3.388 -1.856 1.478 2.680 0.216
3 0.299 0.191 -1.956 3.113 2.137 -0.484
GMM Plot Result
0.390690937989 [[-0.12477417  3.81732378]] [ 1.9981431   2.84351161] -90.5829208615
0.310512074081 [[ 3.38803274 -1.85562793]] [ 1.42879596  2.70620153] 170.546192272
0.298796987929 [[ 0.19071373 -1.9561542 ]] [ 1.7359583   3.35348573] -115.760442027
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.956 0.013 0.047 6.673968e-07 0.044 0.240
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.956 0.014 0.046 6.743583e-07 0.044 0.241

weight mean_x mean_y sig_x sig_y corr
1 0.402 0.044 3.795 2.917 2.029 -0.048
2 0.306 0.134 -2.030 3.129 2.362 -0.524
3 0.292 3.397 -1.901 1.466 2.418 0.209
GMM Plot Result
0.401599257833 [[ 0.04382183  3.79483665]] [ 2.0244333   2.92058036] -93.7204466016
0.306462623012 [[ 0.13433705 -2.03000888]] [ 1.81027975  3.47737871] -120.74672034
0.291938119155 [[ 3.39660761 -1.90056889]] [ 1.41613262  2.44788877] 169.064156118
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.956 0.014 0.040 6.620588e-07 0.043 0.239
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.956 0.013 0.039 6.701976e-07 0.044 0.240
Wall time: 19.9 s

7.2 Cross-validation, to select the number of Gaussian

In [76]:
%%time
from sklearn.cross_validation import train_test_split, KFold

## 5-fold cross validation
gaussian_number_range = arange(1,6)
CV_result_train_all,CV_result_test_all =[],[]
number_of_fold = 4
print('Number of train/test dataset', len(df)*(number_of_fold-1)/number_of_fold, len(df)/number_of_fold) 

for number_of_gaussian in gaussian_number_range:
    print( '  ')
    print('Number of gaussian', number_of_gaussian)
    
    kf = KFold(len(df), n_folds=number_of_fold, shuffle=True) 

    CV_result = Parallel(n_jobs=-1)(delayed(fit_per_fold)(df, train_index, test_index, FIT_METHOD, number_of_gaussian, config) for train_index, test_index in kf)                        

    CV_result_train, CV_result_test = list(zip(*CV_result))
    CV_result_train, CV_result_test = list(CV_result_train), list(CV_result_test)
        
    CV_result_train_all.append(CV_result_train)
    CV_result_test_all.append(CV_result_test)
    
    print('Train')
    pretty_pd_display(CV_result_train)
    print('Test')
    pretty_pd_display(CV_result_test)
Number of train/test dataset 65388.75 21796.25
  
Number of gaussian 1
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.145385 0.072275 0.000004 0.107476 0.587242 0.736719
1 0.143751 0.072794 0.000004 0.106175 0.586279 0.738507
2 0.146320 0.072206 0.000004 0.108712 0.588563 0.735069
3 0.144439 0.071532 0.000004 0.107988 0.587137 0.736325
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.147678 0.071002 0.000004 0.107965 0.587869 0.737170
1 0.153707 0.072764 0.000004 0.113220 0.596823 0.726018
2 0.144492 0.072285 0.000004 0.104921 0.587035 0.738835
3 0.150763 0.073433 0.000004 0.107038 0.591450 0.735193
  
Number of gaussian 2
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.067793 0.033206 0.000001 0.057496 0.312482 0.925540
1 0.064308 0.033013 0.000001 0.056777 0.311445 0.925791
2 0.068700 0.033136 0.000001 0.056908 0.310573 0.926457
3 0.070178 0.032780 0.000001 0.057846 0.315786 0.923838
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.071042 0.030434 0.000001 0.058008 0.320921 0.921372
1 0.081186 0.030753 0.000001 0.059765 0.321580 0.921722
2 0.072695 0.035810 0.000001 0.059883 0.327219 0.918254
3 0.066659 0.035242 0.000001 0.056922 0.310767 0.926597
  
Number of gaussian 3
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.042183 0.013818 6.703640e-07 0.043746 0.240359 0.956059
1 0.045414 0.013298 6.810901e-07 0.044492 0.242386 0.955459
2 0.039147 0.014644 6.710621e-07 0.044297 0.240698 0.955535
3 0.041105 0.013501 6.693157e-07 0.043982 0.240175 0.955713
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.048541 0.017420 7.363469e-07 0.047121 0.252256 0.951002
1 0.054267 0.016427 7.185707e-07 0.045302 0.248848 0.951886
2 0.053387 0.011898 7.457092e-07 0.045737 0.253177 0.951989
3 0.043836 0.015369 7.312701e-07 0.046084 0.251371 0.952675
  
Number of gaussian 4
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.034457 0.012413 5.662516e-07 0.041218 0.220997 0.962342
1 0.035858 0.011116 5.941374e-07 0.040791 0.226372 0.960965
2 0.036580 0.011615 5.905007e-07 0.041094 0.225622 0.961263
3 0.036842 0.011596 6.256025e-07 0.042737 0.232289 0.958990
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.038713 0.014380 6.473773e-07 0.041071 0.236236 0.958796
1 0.043370 0.014784 7.054603e-07 0.047533 0.246609 0.953436
2 0.047962 0.012188 7.087888e-07 0.046103 0.247376 0.952969
3 0.040946 0.015089 6.076840e-07 0.041382 0.228883 0.959605
  
Number of gaussian 5
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.035535 0.012153 5.124178e-07 0.037813 0.210183 0.966431
1 0.050830 0.017129 3.882528e-07 0.033817 0.182996 0.974321
2 0.045139 0.013867 4.821136e-07 0.037312 0.203985 0.968128
3 0.049583 0.016642 3.865405e-07 0.033801 0.182522 0.974641
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.041808 0.008592 5.445294e-07 0.042008 0.216805 0.963728
1 0.096641 0.022605 4.609877e-07 0.035581 0.199344 0.970166
2 0.050280 0.010871 6.191453e-07 0.042460 0.230800 0.959861
3 0.058600 0.015422 4.453498e-07 0.034795 0.196159 0.970438
Wall time: 54.7 s
In [77]:
train_scores_mean, train_scores_std = generate_mean_std_gof(CV_result_train_all)
print('Train gof mean, std')
display(train_scores_mean)

test_scores_mean, test_scores_std = generate_mean_std_gof(CV_result_test_all)
print('Test gof mean, std')
display(test_scores_mean)
Train gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.144974 0.072202 3.999655e-06 0.107588 0.587305 0.736655
2 0.067745 0.033034 1.132953e-06 0.057257 0.312571 0.925406
3 0.041962 0.013815 6.729580e-07 0.044129 0.240904 0.955692
4 0.035934 0.011685 5.941230e-07 0.041460 0.226320 0.960890
5 0.045272 0.014948 4.423312e-07 0.035686 0.194922 0.970880
Test gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.149160 0.072371 4.047505e-06 0.108286 0.590794 0.734304
2 0.072895 0.033060 1.188763e-06 0.058645 0.320122 0.921987
3 0.050008 0.015278 7.329742e-07 0.046061 0.251413 0.951888
4 0.042748 0.014110 6.673276e-07 0.044022 0.239776 0.956202
5 0.061832 0.014372 5.175030e-07 0.038711 0.210777 0.966048
In [78]:
prop_cycle=mpl.rcParams['axes.color_cycle']
gaussian_number_range = train_scores_mean.index
for column, column_name in zip(['R_square','K_S','Chi_square'],["$\ R^2$", "K-S", "$\widetilde{\chi^2} $"]):
    plot(gaussian_number_range, train_scores_mean[column],
             '--', label = 'training', color=prop_cycle[0])
    plt.fill_between(gaussian_number_range, 
                     train_scores_mean[column] - train_scores_std[column],
                     train_scores_mean[column] + train_scores_std[column], 
                     alpha=0.2, color=prop_cycle[0])
    
    plot(gaussian_number_range, test_scores_mean[column],
             '-', label = 'test',color=prop_cycle[1])
    plt.fill_between(gaussian_number_range, 
                 test_scores_mean[column] - test_scores_std[column],
                 test_scores_mean[column] + test_scores_std[column], 
                 alpha=0.2,color=prop_cycle[1])
    plt.xticks(gaussian_number_range)
    print(column)
    plt.locator_params(axis='y', nbins=5)
    plt_configure(xlabel='Number of Gaussian Distributions', ylabel=column_name, 
                  figsize=(3,2), legend={'loc':'best'})
    if column == 'R_square':
        plt.gca().set_ylim(top=1)
    if column == 'K_S' or column == 'Chi_square':
        plt.gca().set_ylim(bottom=0)
    plt.show()
R_square
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
K_S
Chi_square
In [79]:
fig = plt.figure(figsize=(5,2.5))
ax1 = fig.add_subplot(1,2,1) 
plot_2d_prob_density(X, Y, kde_Z, ax=ax1,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar=False)
ax1.grid(False)
ax2 = fig.add_subplot(1,2,2) 
plot_2d_prob_density(X, Y, pdf_Z, ax=ax2,
                     xlabel='x'+speed_unit_text, ylabel='', colorbar=False)
ax2.grid(False)
ax2.get_yaxis().set_visible(False)
In [ ]:
for fig in [fig_hist, fig_kde, fig_em, fig_gmm]:
    display(fig)
for fig in [fig_time_variability_3d, fig_time_variability_cdf, fig_time_variability_weibull, 
            fig_adjecent_variability_3d, fig_adjecent_variability_cdf, fig_adjecent_variability_weibull,]:
    display(fig)
In [ ]:
import time
save_notebook()
time.sleep(3)
location_name = get_location_name(file_path)
print(location_name)
current_file = 'GMM.ipynb'
output_file = './output_HTML/'+location_name+'.html' 
output_HTML(current_file, output_file)